Business case - Loan data
# Tidyverse
library(tidyverse)
library(vroom)
# Data Table
library(data.table)
# Counter
library(tictoc)
# 2.0 DATA IMPORT ----
# 2.1 Loan Acquisitions Data ----
col_types_acq <- list(
loan_id = col_factor(),
original_channel = col_factor(NULL),
seller_name = col_factor(NULL),
original_interest_rate = col_double(),
original_upb = col_integer(),
original_loan_term = col_integer(),
original_date = col_date("%m/%Y"),
first_pay_date = col_date("%m/%Y"),
original_ltv = col_double(),
original_cltv = col_double(),
number_of_borrowers = col_double(),
original_dti = col_double(),
original_borrower_credit_score = col_double(),
first_time_home_buyer = col_factor(NULL),
loan_purpose = col_factor(NULL),
property_type = col_factor(NULL),
number_of_units = col_integer(),
occupancy_status = col_factor(NULL),
property_state = col_factor(NULL),
zip = col_integer(),
primary_mortgage_insurance_percent = col_double(),
product_type = col_factor(NULL),
original_coborrower_credit_score = col_double(),
mortgage_insurance_type = col_double(),
relocation_mortgage_indicator = col_factor(NULL))
acquisition_data <- vroom(
file = "00_data/loan_data/Acquisition_2019Q1.txt",
delim = "|",
col_names = names(col_types_acq),
col_types = col_types_acq,
na = c("", "NA", "NULL"))
acquisition_data %>% glimpse()
## Rows: 297,452
## Columns: 25
## $ loan_id <fct> 100000913397, 100017539727, 1000180~
## $ original_channel <fct> C, B, R, C, B, C, R, R, R, B, R, C,~
## $ seller_name <fct> "JPMORGAN CHASE BANK, NATIONAL ASSO~
## $ original_interest_rate <dbl> 5.875, 4.750, 4.875, 4.875, 4.250, ~
## $ original_upb <int> 324000, 307000, 256000, 248000, 490~
## $ original_loan_term <int> 360, 360, 360, 360, 360, 360, 360, ~
## $ original_date <date> 2018-09-01, 2018-12-01, 2018-11-01~
## $ first_pay_date <date> 2018-11-01, 2019-02-01, 2019-01-01~
## $ original_ltv <dbl> 80, 90, 90, 90, 67, 69, 95, 80, 97,~
## $ original_cltv <dbl> 80, 90, 90, 90, 67, 69, 95, 80, 97,~
## $ number_of_borrowers <dbl> 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1,~
## $ original_dti <dbl> 49, 44, 41, 40, 35, 31, 45, 47, 37,~
## $ original_borrower_credit_score <dbl> 692, 722, 728, 730, 727, 798, 710, ~
## $ first_time_home_buyer <fct> N, N, N, Y, Y, N, N, N, Y, Y, N, N,~
## $ loan_purpose <fct> C, P, P, P, P, P, P, P, P, P, P, P,~
## $ property_type <fct> PU, PU, SF, SF, CO, PU, PU, CO, SF,~
## $ number_of_units <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ occupancy_status <fct> P, P, S, P, P, P, P, P, P, P, P, P,~
## $ property_state <fct> CA, TX, NC, IL, CA, FL, TX, TX, IN,~
## $ zip <int> 925, 770, 286, 600, 945, 337, 773, ~
## $ primary_mortgage_insurance_percent <dbl> NA, 25, 25, 25, NA, NA, 30, NA, 35,~
## $ product_type <fct> FRM, FRM, FRM, FRM, FRM, FRM, FRM, ~
## $ original_coborrower_credit_score <dbl> 665, NA, 738, 791, NA, 810, 778, NA~
## $ mortgage_insurance_type <dbl> NA, 1, 1, 1, NA, NA, 1, NA, 1, 1, 1~
## $ relocation_mortgage_indicator <fct> N, N, N, N, N, N, Y, N, N, N, N, N,~
# 2.2 Performance Data ----
col_types_perf = list(
loan_id = col_factor(),
monthly_reporting_period = col_date("%m/%d/%Y"),
servicer_name = col_factor(NULL),
current_interest_rate = col_double(),
current_upb = col_double(),
loan_age = col_double(),
remaining_months_to_legal_maturity = col_double(),
adj_remaining_months_to_maturity = col_double(),
maturity_date = col_date("%m/%Y"),
msa = col_double(),
current_loan_delinquency_status = col_double(),
modification_flag = col_factor(NULL),
zero_balance_code = col_factor(NULL),
zero_balance_effective_date = col_date("%m/%Y"),
last_paid_installment_date = col_date("%m/%d/%Y"),
foreclosed_after = col_date("%m/%d/%Y"),
disposition_date = col_date("%m/%d/%Y"),
foreclosure_costs = col_double(),
prop_preservation_and_repair_costs = col_double(),
asset_recovery_costs = col_double(),
misc_holding_expenses = col_double(),
holding_taxes = col_double(),
net_sale_proceeds = col_double(),
credit_enhancement_proceeds = col_double(),
repurchase_make_whole_proceeds = col_double(),
other_foreclosure_proceeds = col_double(),
non_interest_bearing_upb = col_double(),
principal_forgiveness_upb = col_double(),
repurchase_make_whole_proceeds_flag = col_factor(NULL),
foreclosure_principal_write_off_amount = col_double(),
servicing_activity_indicator = col_factor(NULL))
performance_data <- vroom(
file = "00_data/loan_data/Performance_2019Q1.txt",
delim = "|",
col_names = names(col_types_perf),
col_types = col_types_perf,
na = c("", "NA", "NULL"))
#performance_data %>% glimpse()
# 3.1 Acquisition Data ----
class(acquisition_data)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
setDT(acquisition_data)
class(acquisition_data)
## [1] "data.table" "data.frame"
#acquisition_data %>% glimpse()
# 3.2 Performance Data ----
setDT(performance_data)
#performance_data %>% glimpse()
# 4.0 DATA WRANGLING ----
# 4.1 Joining / Merging Data ----
tic()
combined_data <- merge(x = acquisition_data, y = performance_data,
by = "loan_id",
all.x = TRUE,
all.y = FALSE)
toc()
## 1.53 sec elapsed
combined_data %>% glimpse()
## Rows: 3,105,040
## Columns: 55
## $ loan_id <fct> 100000913397, 100000913397, 100~
## $ original_channel <fct> C, C, C, C, C, C, C, C, C, C, C~
## $ seller_name <fct> "JPMORGAN CHASE BANK, NATIONAL ~
## $ original_interest_rate <dbl> 5.875, 5.875, 5.875, 5.875, 5.8~
## $ original_upb <int> 324000, 324000, 324000, 324000,~
## $ original_loan_term <int> 360, 360, 360, 360, 360, 360, 3~
## $ original_date <date> 2018-09-01, 2018-09-01, 2018-0~
## $ first_pay_date <date> 2018-11-01, 2018-11-01, 2018-1~
## $ original_ltv <dbl> 80, 80, 80, 80, 80, 80, 80, 80,~
## $ original_cltv <dbl> 80, 80, 80, 80, 80, 80, 80, 80,~
## $ number_of_borrowers <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ original_dti <dbl> 49, 49, 49, 49, 49, 49, 49, 49,~
## $ original_borrower_credit_score <dbl> 692, 692, 692, 692, 692, 692, 6~
## $ first_time_home_buyer <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ loan_purpose <fct> C, C, C, C, C, C, C, C, C, C, C~
## $ property_type <fct> PU, PU, PU, PU, PU, PU, PU, PU,~
## $ number_of_units <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ occupancy_status <fct> P, P, P, P, P, P, P, P, P, P, P~
## $ property_state <fct> CA, CA, CA, CA, CA, CA, CA, CA,~
## $ zip <int> 925, 925, 925, 925, 925, 925, 9~
## $ primary_mortgage_insurance_percent <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ product_type <fct> FRM, FRM, FRM, FRM, FRM, FRM, F~
## $ original_coborrower_credit_score <dbl> 665, 665, 665, 665, 665, 665, 6~
## $ mortgage_insurance_type <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ relocation_mortgage_indicator <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ monthly_reporting_period <date> 2019-01-01, 2019-02-01, 2019-0~
## $ servicer_name <fct> "OTHER", "", "", "", "", "", ""~
## $ current_interest_rate <dbl> 5.875, 5.875, 5.875, 5.875, 5.8~
## $ current_upb <dbl> NA, NA, NA, NA, NA, NA, 320968.~
## $ loan_age <dbl> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12~
## $ remaining_months_to_legal_maturity <dbl> 357, 356, 355, 354, 353, 352, 3~
## $ adj_remaining_months_to_maturity <dbl> 357, 356, 355, 354, 353, 352, 3~
## $ maturity_date <date> 2048-10-01, 2048-10-01, 2048-1~
## $ msa <dbl> 40140, 40140, 40140, 40140, 401~
## $ current_loan_delinquency_status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ modification_flag <fct> N, N, N, N, N, N, N, N, N, N, N~
## $ zero_balance_code <fct> , , , , , , , , , , , , , , , ,~
## $ zero_balance_effective_date <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ last_paid_installment_date <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ foreclosed_after <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ disposition_date <date> NA, NA, NA, NA, NA, NA, NA, NA~
## $ foreclosure_costs <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ prop_preservation_and_repair_costs <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ asset_recovery_costs <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ misc_holding_expenses <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ holding_taxes <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ net_sale_proceeds <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ credit_enhancement_proceeds <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ repurchase_make_whole_proceeds <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ other_foreclosure_proceeds <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ non_interest_bearing_upb <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ principal_forgiveness_upb <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ repurchase_make_whole_proceeds_flag <fct> , , , , , , , , , , , , , , , ,~
## $ foreclosure_principal_write_off_amount <dbl> NA, NA, NA, NA, NA, NA, NA, NA,~
## $ servicing_activity_indicator <fct> N, N, N, N, N, N, N, N, N, N, N~
# Same operation with dplyr
tic()
performance_data %>%
left_join(acquisition_data, by = "loan_id")
toc()
## 3.35 sec elapsed
# Preparing the Data Table
setkey(combined_data, "loan_id")
key(combined_data)
## [1] "loan_id"
#?setorder()
setorderv(combined_data, c("loan_id", "monthly_reporting_period"))
# 4.3 Select Columns ----
combined_data %>% dim()
## [1] 3105040 55
keep_cols <- c("loan_id",
"monthly_reporting_period",
"seller_name",
"current_interest_rate",
"current_upb",
"loan_age",
"remaining_months_to_legal_maturity",
"adj_remaining_months_to_maturity",
"current_loan_delinquency_status",
"modification_flag",
"zero_balance_code",
"foreclosure_costs",
"prop_preservation_and_repair_costs",
"asset_recovery_costs",
"misc_holding_expenses",
"holding_taxes",
"net_sale_proceeds",
"credit_enhancement_proceeds",
"repurchase_make_whole_proceeds",
"other_foreclosure_proceeds",
"non_interest_bearing_upb",
"principal_forgiveness_upb",
"repurchase_make_whole_proceeds_flag",
"foreclosure_principal_write_off_amount",
"servicing_activity_indicator",
"original_channel",
"original_interest_rate",
"original_upb",
"original_loan_term",
"original_ltv",
"original_cltv",
"number_of_borrowers",
"original_dti",
"original_borrower_credit_score",
"first_time_home_buyer",
"loan_purpose",
"property_type",
"number_of_units",
"property_state",
"occupancy_status",
"primary_mortgage_insurance_percent",
"product_type",
"original_coborrower_credit_score",
"mortgage_insurance_type",
"relocation_mortgage_indicator")
combined_data <- combined_data[, ..keep_cols]
combined_data %>% dim()
## [1] 3105040 45
#combined_data %>% glimpse()
# 4.4 Grouped Mutations ----
# - Add response variable (Predict wether loan will become delinquent in next 3 months)
# dplyr
tic()
temp <- combined_data %>%
group_by(loan_id) %>%
mutate(gt_1mo_behind_in_3mo_dplyr = lead(current_loan_delinquency_status, n = 3) >= 1) %>%
ungroup()
toc()
## 12.17 sec elapsed
combined_data %>% dim()
## [1] 3105040 45
temp %>% dim()
## [1] 3105040 46
# data.table
tic()
combined_data[, gt_1mo_behind_in_3mo := lead(current_loan_delinquency_status, n = 3) >= 1,
by = loan_id]
toc()
## 12 sec elapsed
combined_data %>% dim()
## [1] 3105040 46
# Remove the temp variable
rm(temp)
# 5.1 How many loans in a month ----
tic()
combined_data[!is.na(monthly_reporting_period), .N, by = monthly_reporting_period]
toc()
## 0.25 sec elapsed
tic()
combined_data %>%
filter(!is.na(monthly_reporting_period)) %>%
count(monthly_reporting_period)
toc()
## 2.24 sec elapsed
# 5.2 Which loans have the most outstanding delinquencies ----
# data.table
tic()
combined_data[current_loan_delinquency_status >= 1,
list(loan_id, monthly_reporting_period, current_loan_delinquency_status, seller_name, current_upb)][
, max(current_loan_delinquency_status), by = loan_id][
order(V1, decreasing = TRUE)]
toc()
## 0.29 sec elapsed
# 5.3 Get last unpaid balance value for delinquent loans ----
# data.table
tic()
combined_data[current_loan_delinquency_status >= 1, .SD[.N], by = loan_id][
!is.na(current_upb)][
order(-current_upb), .(loan_id, monthly_reporting_period, current_loan_delinquency_status, seller_name, current_upb)
]
toc()
## 3.55 sec elapsed
# 5.4 Loan Companies with highest unpaid balance
# data.table
tic()
upb_by_company_dt <- combined_data[!is.na(current_upb), .SD[.N], by = loan_id][
, .(sum_current_upb = sum(current_upb, na.rm = TRUE), cnt_current_upb = .N), by = seller_name][
order(sum_current_upb, decreasing = TRUE)]
toc()
## 39.33 sec elapsed
upb_by_company_dt